{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pandas import Series, DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Plate ID</th>\n",
       "      <th>Registration State</th>\n",
       "      <th>Vehicle Make</th>\n",
       "      <th>Street Name</th>\n",
       "      <th>Vehicle Color</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>J58JKX</td>\n",
       "      <td>NJ</td>\n",
       "      <td>HONDA</td>\n",
       "      <td>43 ST</td>\n",
       "      <td>BK</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>KRE6058</td>\n",
       "      <td>PA</td>\n",
       "      <td>ME/BE</td>\n",
       "      <td>UNION ST</td>\n",
       "      <td>BLK</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>444326R</td>\n",
       "      <td>NJ</td>\n",
       "      <td>LEXUS</td>\n",
       "      <td>CLERMONT AVENUE</td>\n",
       "      <td>BLACK</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>F728330</td>\n",
       "      <td>OH</td>\n",
       "      <td>CHEVR</td>\n",
       "      <td>DIVISION AVE</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>FMY9090</td>\n",
       "      <td>NY</td>\n",
       "      <td>JEEP</td>\n",
       "      <td>GRAND ST</td>\n",
       "      <td>GREY</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Plate ID Registration State Vehicle Make      Street Name Vehicle Color\n",
       "0   J58JKX                 NJ        HONDA            43 ST            BK\n",
       "1  KRE6058                 PA        ME/BE         UNION ST           BLK\n",
       "2  444326R                 NJ        LEXUS  CLERMONT AVENUE         BLACK\n",
       "3  F728330                 OH        CHEVR     DIVISION AVE           NaN\n",
       "4  FMY9090                 NY         JEEP         GRAND ST          GREY"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "filename = '../data/nyc-parking-violations-2020.csv'\n",
    "\n",
    "df = pd.read_csv(filename,\n",
    "                 usecols=['Plate ID',  'Registration State',\n",
    "                        'Vehicle Make', 'Vehicle Color', 'Street Name'])\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "',Vehicle Color\\n2752511,RED\\n964568,BLUE\\n5049760,BK\\n4248515,GY\\n6899272,RED\\n11549116,BL\\n9816025,BK\\n11346931,WH\\n2772528,BK\\n3663790,WH\\n7179841,WH\\n661128,BLUE\\n5401291,BLACK\\n526821,WHITE\\n4014922,BK\\n6166605,RED\\n11905504,BLACK\\n428139,BK\\n5328844,WHITE\\n492842,SILVE\\n920426,BL\\n1979514,GY\\n2105457,WH\\n11996578,RD\\n3992380,WH\\n12480694,GY\\n1009122,GY\\n9539409,WH\\n11521484,BK\\n8954222,WHITE\\n'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.random.seed(0)\n",
    "df['Vehicle Color'].sample(30).to_csv()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1896"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# How many different colors are there?\n",
    "len(df['Vehicle Color'].value_counts().index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "WH       2344858\n",
       "GY       2307704\n",
       "BK       2066374\n",
       "WHITE    1061234\n",
       "BL        775124\n",
       "RD        483298\n",
       "BLACK     465110\n",
       "GREY      306787\n",
       "BROWN     292348\n",
       "SILVE     191477\n",
       "GR        182929\n",
       "BLUE      178298\n",
       "RED       161693\n",
       "TN        120576\n",
       "BR        102204\n",
       "YW         98700\n",
       "BLK        91539\n",
       "OTHER      60245\n",
       "GREEN      58765\n",
       "GL         54851\n",
       "GRY        46527\n",
       "MR         42812\n",
       "GRAY       40854\n",
       "WHT        35433\n",
       "YELLO      32792\n",
       "WHI        29760\n",
       "OR         28100\n",
       "BK.        27830\n",
       "WT         25583\n",
       "WT.        24593\n",
       "Name: Vehicle Color, dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Look at the 30 most common colors, and identify colors that appear multiple times, but written differently.\n",
    "# For example, the color `WHITE` is also written as `WT` and also as `WT.` and also as `WHT`.\n",
    "\n",
    "df['Vehicle Color'].value_counts().head(30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prepare a Python dict in which the keys represent the various color-name inputs, \n",
    "# and the values represent the values that you want them to have in the end. \n",
    "# I suggest aiming to use the longer names, such as `WHITE`, rather than the shorter ones.\n",
    "\n",
    "colormap = {'WH': 'WHITE', \n",
    "          'GY':'GRAY', \n",
    "             'BK':'BLACK',\n",
    "             'BL':'BLUE',\n",
    "             'RD':'RED', \n",
    "             'GR':'GRAY',\n",
    "             'TN':'TAN',\n",
    "             'BR':'BROWN', \n",
    "             'YW':'YELLO', \n",
    "             'BLK':'BLACK',\n",
    "             'GRY':'GRAY', \n",
    "             'WHT':'WHITE', \n",
    "             'WHI':'WHITE', \n",
    "             'OR':'ORANG',\n",
    "             'BK.':'BLACK',\n",
    "             'WT':'WHITE',\n",
    "            'WT.':'WHITE'}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace the existing (old) colors with your translations.  \n",
    "df['Vehicle Color'] = df['Vehicle Color'].replace(colormap)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1879"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# How many colors are there now?\n",
    "df['Vehicle Color'].value_counts().shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "WHITE    3521461\n",
       "BLACK    2650853\n",
       "GRAY     2578014\n",
       "BLUE      953422\n",
       "RED       644991\n",
       "BROWN     394552\n",
       "GREY      306787\n",
       "SILVE     191477\n",
       "TAN       141667\n",
       "YELLO     131492\n",
       "OTHER      60245\n",
       "GREEN      58765\n",
       "GL         54851\n",
       "MR         42812\n",
       "ORANG      39606\n",
       "GY.        22460\n",
       "GOLD       21687\n",
       "SIL        20116\n",
       "BLU        15240\n",
       "SL.        13145\n",
       "LTGY       13055\n",
       "SL         10343\n",
       "LTG        10093\n",
       "BL.         9649\n",
       "LT/         8976\n",
       "PR          7518\n",
       "DK/         7498\n",
       "W           7367\n",
       "RD.         7128\n",
       "DKGY        6004\n",
       "GYGY        5039\n",
       "BLK.        4853\n",
       "GRN         4829\n",
       "B           4145\n",
       "WH.         3811\n",
       "BRO         3802\n",
       "DKG         3702\n",
       "PURPL       3635\n",
       "BRN         3582\n",
       "BKGY        3504\n",
       "WHBL        3489\n",
       "DKBL        2912\n",
       "GN          2883\n",
       "WHT.        2796\n",
       "BN          2787\n",
       "BLUE.       2638\n",
       "WHGY        2381\n",
       "UNKNO       2205\n",
       "RED.        2141\n",
       "BRW         2081\n",
       "Name: Vehicle Color, dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Look through the top 50 colors, now that you have removed a bunch of them.\n",
    "# Are there any you could still clean up? Are there any you cannot figure out? \n",
    "# Can you identify some consistent typos and errors in the colors?\n",
    "\n",
    "# There are still a lot of inconsistent colors. Many have a period (.) after the name,\n",
    "# and others have a slash (/) after the name, which is a typo probably caused by . and /\n",
    "# being next to each other on the keyboard. I'll guess that LTGY is \"light gray\", but\n",
    "# is there any standardization to the color list? What are \"MR\", \"PR\", and \"GYGY\"?\n",
    "\n",
    "# Note: Never let users enter text into a field if you can instead give them a \n",
    "# menu from which to choose. It avoids these problems!\n",
    "\n",
    "df['Vehicle Color'].value_counts().head(50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
